{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import h2o4gpu\n",
    "import time\n",
    "import sys\n",
    "import os\n",
    "import feather\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Can import data using pandas or feather.\n",
    "use_pandas = True\n",
    "\n",
    "#Define problem type\n",
    "classification = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Path to data\n",
    "\n",
    "#Logistic Regression Example\n",
    "if classification:\n",
    "    if use_pandas:\n",
    "        data_file = \"https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/creditcard.csv\" #If importing using pandas\n",
    "    else:\n",
    "        data_file = \"https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/credit.feather\"\n",
    "\n",
    "#Regression Example\n",
    "else:\n",
    "    if use_pandas:\n",
    "        data_file = \"../../../h2oai-prototypes/glm-bench/ipums.csv\" #If importing using pandas\n",
    "    else:\n",
    "        data_file = \"../../data/ipums.feather\"\n",
    "\n",
    "#Fraction to split validation set by\n",
    "valid_fraction = 0.2\n",
    "\n",
    "#Define if intercept should be used or not\n",
    "fit_intercept = True\n",
    "\n",
    "#Set up parameters for GPU GLM\n",
    "lambda_min_ratio = 1e-9 \n",
    "n_folds = 5\n",
    "n_lambdas = 20\n",
    "n_alphas = 3\n",
    "store_full_path = 0\n",
    "n_gpus = 1\n",
    "verbose = 0\n",
    "family = \"logistic\" if classification else \"elasticnet\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Util to calculate logloss & rmse\n",
    "\n",
    "def ll(actual, predicted):\n",
    "    \"\"\"\n",
    "    Computes the log likelihood.\n",
    "    This function computes the log likelihood between two numbers,\n",
    "    or for element between a pair of lists or numpy arrays.\n",
    "    Parameters\n",
    "    ----------\n",
    "    actual : int, float, list of numbers, numpy array\n",
    "             The ground truth value\n",
    "    predicted : same type as actual\n",
    "                The predicted value\n",
    "    Returns\n",
    "    -------\n",
    "    score : double or list of doubles\n",
    "            The log likelihood error between actual and predicted\n",
    "    \"\"\"\n",
    "    actual = np.array(actual)\n",
    "    predicted = np.array(predicted)\n",
    "    for i in range(0,predicted.shape[0]):\n",
    "        predicted[i] = min(max(1e-15,predicted[i]),1-1e-15)\n",
    "    err = np.seterr(all='ignore')\n",
    "    score = -(actual*np.log(predicted)+(1-actual)*np.log(1-predicted))\n",
    "    np.seterr(divide=err['divide'], over=err['over'],\n",
    "              under=err['under'], invalid=err['invalid'])\n",
    "    if type(score)==np.ndarray:\n",
    "        score[np.isnan(score)] = 0\n",
    "    else:\n",
    "        if np.isnan(score):\n",
    "            score = 0\n",
    "    return score\n",
    "\n",
    "def log_loss(actual, predicted):\n",
    "    \"\"\"\n",
    "    Computes the log loss.\n",
    "    This function computes the log loss between two lists\n",
    "    of numbers.\n",
    "    Parameters\n",
    "    ----------\n",
    "    actual : list of numbers, numpy array\n",
    "             The ground truth value\n",
    "    predicted : same type as actual\n",
    "                The predicted value\n",
    "    Returns\n",
    "    -------\n",
    "    score : double\n",
    "            The log loss between actual and predicted\n",
    "    \"\"\"\n",
    "    return np.mean(ll(actual, predicted))\n",
    "\n",
    "def se(actual, predicted):\n",
    "    \"\"\"\n",
    "    Computes the squared error.\n",
    "    This function computes the squared error between two numbers,\n",
    "    or for element between a pair of lists or numpy arrays.\n",
    "    Parameters\n",
    "    ----------\n",
    "    actual : int, float, list of numbers, numpy array\n",
    "             The ground truth value\n",
    "    predicted : same type as actual\n",
    "                The predicted value\n",
    "    Returns\n",
    "    -------\n",
    "    score : double or list of doubles\n",
    "            The squared error between actual and predicted\n",
    "    \"\"\"\n",
    "    return np.power(np.array(actual)-np.array(predicted), 2)\n",
    "\n",
    "def mse(actual, predicted):\n",
    "    \"\"\"\n",
    "    Computes the mean squared error.\n",
    "    This function computes the mean squared error between two lists\n",
    "    of numbers.\n",
    "    Parameters\n",
    "    ----------\n",
    "    actual : list of numbers, numpy array\n",
    "             The ground truth value\n",
    "    predicted : same type as actual\n",
    "                The predicted value\n",
    "    Returns\n",
    "    -------\n",
    "    score : double\n",
    "            The mean squared error between actual and predicted\n",
    "    \"\"\"\n",
    "    return np.mean(se(actual, predicted))\n",
    "\n",
    "def rmse(actual, predicted):\n",
    "    \"\"\"\n",
    "    Computes the root mean squared error.\n",
    "    This function computes the root mean squared error between two lists\n",
    "    of numbers.\n",
    "    Parameters\n",
    "    ----------\n",
    "    actual : list of numbers, numpy array\n",
    "             The ground truth value\n",
    "    predicted : same type as actual\n",
    "                The predicted value\n",
    "    Returns\n",
    "    -------\n",
    "    score : double\n",
    "            The root mean squared error between actual and predicted\n",
    "    \"\"\"\n",
    "    return np.sqrt(mse(actual, predicted))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading Data with Pandas\n",
      "(23999, 25)\n"
     ]
    }
   ],
   "source": [
    "if use_pandas:\n",
    "    print(\"Reading Data with Pandas\")\n",
    "    data = pd.read_csv(data_file)\n",
    "else:\n",
    "    print(\"Reading Data with Feather\")\n",
    "    data = feather.read_dataframe(data_file)\n",
    "print(data.shape)\n",
    "data_x = np.array(data.iloc[:, :data.shape[1] - 1], dtype='float32', order='C')\n",
    "data_y = np.array(data.iloc[:, data.shape[1] - 1], dtype='float32', order='C')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original m=23999 n=24\n",
      "Size of Train rows=19200 & valid rows=4799\n",
      "Size of Train cols=24 valid cols=24\n",
      "Size of Train cols=25 & valid cols=25 after adding intercept column\n"
     ]
    }
   ],
   "source": [
    "#Setup train/validation set split (assuming form of mxn where m=row count and n=col count)\n",
    "morig = data_x.shape[0]\n",
    "norig = data_x.shape[1]\n",
    "print(\"Original m=%d n=%d\" % (morig, norig))\n",
    "sys.stdout.flush()\n",
    "\n",
    "#Do train/valid split\n",
    "valid_fraction = valid_fraction\n",
    "HO = int(valid_fraction * morig)\n",
    "H = morig - HO\n",
    "print(\"Size of Train rows=%d & valid rows=%d\" % (H, HO))\n",
    "sys.stdout.flush()\n",
    "train_x = np.copy(data_x[0:H, :])\n",
    "train_y = np.copy(data_y[0:H])\n",
    "valid_x = np.copy(data_x[H:morig, :])\n",
    "valid_y = np.copy(data_y[H:morig])\n",
    "print(\"Size of Train cols=%d valid cols=%d\" % (train_x.shape[1], valid_x.shape[1]))\n",
    "\n",
    "#Using intercept\n",
    "if fit_intercept:\n",
    "    train_x = np.hstack([train_x, np.ones((train_x.shape[0], 1), dtype=train_x.dtype)])\n",
    "    valid_x = np.hstack([valid_x, np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)])\n",
    "    n = train_x.shape[1]\n",
    "    print(\"Size of Train cols=%d & valid cols=%d after adding intercept column\" % (train_x.shape[1], valid_x.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Choose solver\n",
    "Solver = h2o4gpu.GLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Setting up solver\n"
     ]
    }
   ],
   "source": [
    "fortran = train_x.flags.f_contiguous #Row major vs Column major\n",
    "print(\"Setting up solver\")\n",
    "sys.stdout.flush()\n",
    "solver = Solver(\n",
    "              n_gpus=n_gpus, \n",
    "              order='c' if fortran else 'r', \n",
    "              fit_intercept=fit_intercept,\n",
    "              lambda_min_ratio=lambda_min_ratio,\n",
    "              n_lambdas=n_lambdas, \n",
    "              n_folds=n_folds, \n",
    "              n_alphas=n_alphas, \n",
    "              verbose=verbose, \n",
    "              family=family, \n",
    "              store_full_path=store_full_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Solving\n",
      "Done Solving\n"
     ]
    }
   ],
   "source": [
    "print(\"Solving\")\n",
    "fit = solver.fit(train_x, train_y)\n",
    "print(\"Done Solving\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Xvsalpha\n",
      "[[  2.22630342e-06  -4.85086105e-07  -8.17872882e-02  -1.10862039e-01\n",
      "   -1.54789597e-01   5.13954367e-03   5.72748542e-01   6.61257505e-02\n",
      "    8.27719048e-02  -7.26411492e-03   9.42052305e-02  -1.44003155e-02\n",
      "   -6.42589930e-06   2.91982656e-06   4.15507202e-07  -4.52019606e-07\n",
      "    9.12767348e-07   1.53325118e-06  -1.64839712e-05  -6.88376031e-06\n",
      "   -3.08774770e-06  -4.16595958e-06  -1.68937606e-06  -1.87233877e-06\n",
      "    1.36942859e-03  -6.61261082e-01]\n",
      " [  2.11968040e-06  -4.50975733e-07  -2.37388834e-02  -7.63612837e-02\n",
      "   -9.00145248e-02   5.85425133e-03   5.60724556e-01   6.25579506e-02\n",
      "    7.60950744e-02   0.00000000e+00   7.48864710e-02   0.00000000e+00\n",
      "   -6.67043605e-06   3.19906439e-06   4.69731475e-07  -7.39129746e-07\n",
      "    1.96886117e-06   7.15584065e-07  -1.70692419e-05  -7.29354133e-06\n",
      "   -3.12508791e-06  -5.21898755e-06  -9.81220637e-07  -2.07797939e-06\n",
      "    0.00000000e+00  -9.35642660e-01]\n",
      " [  2.31712875e-06  -4.52045981e-07  -3.96523178e-02  -8.53509903e-02\n",
      "   -1.09921202e-01   5.14200469e-03   5.71345329e-01   5.95524311e-02\n",
      "    7.62892663e-02   0.00000000e+00   8.03619623e-02   0.00000000e+00\n",
      "   -6.53949428e-06   3.07814730e-06   5.16414673e-07  -7.75175067e-07\n",
      "    1.98056387e-06   6.19125444e-07  -1.74877387e-05  -6.82556129e-06\n",
      "   -3.20552090e-06  -4.40014992e-06  -6.48715968e-07  -1.86076295e-06\n",
      "    0.00000000e+00  -8.44296694e-01]]\n",
      "np.shape(Xvsalpha)\n",
      "(3, 26)\n",
      "logloss_train\n",
      "[[ 0.47700763  0.47543135 -1.        ]\n",
      " [ 0.47717693  0.47582042 -1.        ]\n",
      " [ 0.47711235  0.4755854  -1.        ]]\n",
      "Best lambdas\n",
      "[[ 21.76660919]\n",
      " [ 69.19573212]\n",
      " [ 27.60074806]]\n",
      "Best alphas\n",
      "[[ 0. ]\n",
      " [ 0.5]\n",
      " [ 1. ]]\n",
      "Best tols\n",
      "[[ 0.01]\n",
      " [ 0.01]\n",
      " [ 0.01]]\n"
     ]
    }
   ],
   "source": [
    "# Show something about Xvsalphalambda or Xvsalpha\n",
    "print(\"Xvsalpha\")\n",
    "print(fit.x_vs_alphapure)\n",
    "\n",
    "print(\"np.shape(Xvsalpha)\")\n",
    "print(np.shape(fit.x_vs_alphapure))\n",
    "\n",
    "error_train = fit.error_vs_alpha\n",
    "if classification:\n",
    "    print(\"logloss_train\")\n",
    "else:\n",
    "    print(\"rmse_train\")\n",
    "print(error_train)\n",
    "\n",
    "print(\"Best lambdas\")\n",
    "lambdas = fit.lambdas_best\n",
    "print(lambdas)\n",
    "\n",
    "print(\"Best alphas\")\n",
    "alphas = fit.alphas_best\n",
    "print(alphas)\n",
    "\n",
    "print(\"Best tols\")\n",
    "tols = fit.tols_best\n",
    "print(tols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.7718392   0.14170469  0.14292425 ...,  0.26786035  0.24226156\n",
      "   0.1750378 ]\n",
      " [ 0.76714426  0.14424451  0.1507562  ...,  0.25439146  0.24270132\n",
      "   0.18371987]\n",
      " [ 0.7737245   0.14222543  0.14817348 ...,  0.25907847  0.24351341\n",
      "   0.18025337]]\n"
     ]
    }
   ],
   "source": [
    "#Make predictions on validation\n",
    "preds = fit.predict(valid_x, valid_y)\n",
    "print(preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logloss for alpha =  [ 0.]\n",
      "0.439094\n",
      "Logloss for alpha =  [ 0.5]\n",
      "0.439265\n",
      "Logloss for alpha =  [ 1.]\n",
      "0.439326\n"
     ]
    }
   ],
   "source": [
    "#Get logloss or rmse for validation set per alpha\n",
    "for i in range(n_alphas):\n",
    "    if classification:\n",
    "        print(\"Logloss for alpha = \",alphas[i])\n",
    "        print(log_loss(valid_y, preds[i]))\n",
    "    else:\n",
    "        print(\"RMSE for alpha = \",alphas[i])\n",
    "        print(rmse(valid_y,preds[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6.1(pyenv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
